# download_epaa_issue.py
# EPAA (Education Policy Analysis Archives) Downloader
# Automates downloading English-only PDFs from EPAA issues
# - Parses OJS issue pages
# - Filters by main 'PDF' label to skip Spanish/Portuguese versions
# - Skips Appendix PDFs
# - Creates issue-based folders and sanitizes filenames

import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Default site root; used to resolve links when no page URL is supplied.
BASE = "https://epaa.asu.edu"
HEADERS = {"User-Agent": "Mozilla/5.0"}
# Seconds to wait for the server before giving up on a request; without a
# timeout, requests would block indefinitely on a stalled connection.
TIMEOUT = 30

# Characters that are illegal in file names on common file systems, plus
# ASCII control characters.
_UNSAFE_CHARS = re.compile(r'[\\/*?:"<>|\x00-\x1f]')


def sanitize_filename(text, replacement="", fallback="untitled", max_length=200):
    """Return *text* made safe for use as a single file or folder name.

    Runs of whitespace (including newlines) are collapsed to one space,
    unsafe characters are replaced by *replacement*, the result is cut to
    *max_length* characters, and trailing dots/spaces are removed.
    *fallback* is returned when nothing usable remains (this also covers
    names such as ".." that would otherwise escape the target folder).
    """
    collapsed = " ".join(text.split())
    cleaned = _UNSAFE_CHARS.sub(replacement, collapsed)[:max_length]
    cleaned = cleaned.strip().rstrip(". ")
    return cleaned or fallback


def unique_name(stem, used):
    """Return *stem*, or "stem (2)", "stem (3)", ... if already taken.

    *used* is a set of case-folded names handed out so far in this run; the
    returned name is added to it. Comparison is case-insensitive so the
    result is also safe on case-insensitive file systems. Only names from
    the current run are considered, so re-running the script still
    overwrites files from an earlier run instead of piling up duplicates.
    """
    candidate = stem
    suffix = 2
    while candidate.casefold() in used:
        candidate = f"{stem} ({suffix})"
        suffix += 1
    used.add(candidate.casefold())
    return candidate


def pdf_download_url(href, page_url=BASE):
    """Turn a galley "view" link into an absolute direct-download URL.

    *href* is resolved relative to *page_url* (the page the link was found
    on), and its "/view/" path segment is swapped for "/download/".
    """
    return urljoin(page_url, href.replace("/view/", "/download/"))


def english_pdf_hrefs(article):
    """Yield the href of every galley link in *article* labelled exactly "PDF".

    The exact-match test is what filters out appendix and other-language
    galleys, whose labels carry extra text.
    """
    for link in article.select("a.obj_galley_link.pdf"):
        href = link.get("href")
        if href and link.get_text(strip=True) == "PDF":
            yield href


def fetch_pdf(session, pdf_url):
    """Download *pdf_url* and return its bytes, or None if it is not a PDF.

    Raises requests.RequestException on network errors or HTTP error status.
    """
    response = session.get(pdf_url, timeout=TIMEOUT)
    response.raise_for_status()
    if "application/pdf" not in response.headers.get("Content-Type", ""):
        return None
    return response.content


def main():
    """Prompt for an issue URL and download its English PDFs into a folder."""
    # --- Ask user for issue URL ---
    issue_url = input("Enter EPAA issue URL: ").strip()
    if not issue_url:
        raise SystemExit("❌ No issue URL given")

    # One session for all requests so the connection is reused.
    with requests.Session() as session:
        session.headers.update(HEADERS)

        # --- Get issue page ---
        try:
            page = session.get(issue_url, timeout=TIMEOUT)
            page.raise_for_status()
        except requests.RequestException as e:
            raise SystemExit(f"❌ Could not load issue page: {e}")
        soup = BeautifulSoup(page.text, "html.parser")

        # --- Create folder based on issue title ---
        issue_title_tag = soup.find("h1", class_="page_title")
        if issue_title_tag:
            folder_name = sanitize_filename(
                issue_title_tag.get_text(strip=True),
                replacement="_",
                fallback="EPAA_Issue",
            )
        else:
            folder_name = "EPAA_Issue"
        os.makedirs(folder_name, exist_ok=True)

        articles = soup.select(".obj_article_summary")
        print(f"🔍 Found {len(articles)} articles")

        count = 0
        used_names = set()
        for art in articles:
            title_tag = art.find("h3", class_="title")
            if not title_tag:
                continue
            title = sanitize_filename(title_tag.get_text(strip=True))

            for href in english_pdf_hrefs(art):
                # Resolve against the page actually fetched (after redirects).
                pdf_url = pdf_download_url(href, page.url)
                print(f"[{count+1}]⬇️ Downloading: {title}")
                try:
                    content = fetch_pdf(session, pdf_url)
                    if content is None:
                        print(f"❌ Skipped (not PDF): {title}")
                        continue
                    # Pick the name only once we have data to write, so a
                    # failed download does not reserve a file name.
                    file_name = unique_name(title, used_names)
                    path = os.path.join(folder_name, f"{file_name}.pdf")
                    with open(path, "wb") as f:
                        f.write(content)
                except (requests.RequestException, OSError) as e:
                    # Best effort: report and move on to the next PDF.
                    print(f"❌ Error downloading {title}: {e}")
                    continue
                count += 1
                print(f"✅ Saved: {title}")

    print(f"\n🎉 Done! {count} English PDFs saved in {folder_name}")


if __name__ == "__main__":
    main()
